# Importing libraries
%matplotlib inline
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import seaborn as sns
from scipy.stats import pearsonr
import plotly.express as px
import plotly.graph_objects as go
# Regression models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Cluster model
from sklearn.cluster import KMeans
# Evaluation tools
from sklearn.metrics import ConfusionMatrixDisplay
# Cross-validation
from sklearn.model_selection import cross_val_score
# Directory holding one historical-data CSV per month.
path = "./Dataset"
# Collect the monthly CSV files.
all_files = glob.glob(path + "/dublinbike-historical-data-*.csv")
# Read each file and tag its rows with the month token from the filename.
dfs = []
for filename in all_files:
    month_df = pd.read_csv(filename)
    # The month token sits between the final '-' and the '.csv' suffix.
    month_df['Month'] = filename.split('-')[-1].split('.')[0]
    dfs.append(month_df)
# Stack all months into a single frame with a fresh index.
merged_df = pd.concat(dfs, ignore_index=True)
print("\nThe total rows in merged dataset is:", len(merged_df))
rows_in_each_dataset = [len(df) for df in dfs]
# Report the row count of each monthly file and their running total.
print("\nNumber of rows in each individual dataset:")
count = 0
for i, rows in enumerate(rows_in_each_dataset, start=1):
    print(f"Dataset {i}: {rows} rows")
    count += rows
print("\nThe total rows are:", count)
# Quick visual check of the merged data.
merged_df.head()
# Report missing values per column.
print("\nMissing values in each column:\n", merged_df.isnull().sum())
# Persist the merged frame as a backup.
merged_df.to_csv('merged_data.csv', index=False)
The total rows in merged dataset is: 1994400 Number of rows in each individual dataset: Dataset 1: 168144 rows Dataset 2: 152903 rows Dataset 3: 169844 rows Dataset 4: 164160 rows Dataset 5: 169367 rows Dataset 6: 163590 rows Dataset 7: 169632 rows Dataset 8: 169404 rows Dataset 9: 164046 rows Dataset 10: 169746 rows Dataset 11: 164046 rows Dataset 12: 169518 rows The total rows are: 1994400 Missing values in each column: STATION ID 0 TIME 0 LAST UPDATED 0 NAME 0 BIKE_STANDS 0 AVAILABLE_BIKE_STANDS 0 AVAILABLE_BIKES 0 STATUS 0 ADDRESS 0 LATITUDE 0 LONGITUDE 0 Month 0 dtype: int64
# Parse the raw 'TIME' strings into proper timestamps.
merged_df['datetime_column'] = pd.to_datetime(merged_df['TIME'])
# Derive weekday (Monday=0 .. Sunday=6) and hour-of-day features
# from the parsed timestamps.
merged_df['day_of_week'] = merged_df['datetime_column'].dt.dayofweek
merged_df['hour_of_day'] = merged_df['datetime_column'].dt.hour
# Defining a function to check if a given row matches the desired time slots
def check_time_slot(row):
    """Return True when *row* falls inside the analysed time windows.

    Weekdays (Mon-Fri): morning peak 07:00-09:59, lunch 12:00-12:59,
    evening peak 17:00-19:59. Weekends: daytime 10:00-16:59.
    """
    hour = row['hour_of_day']
    if row['day_of_week'] <= 4:
        # Weekday: any of the three commuter/lunch windows.
        return (7 <= hour < 10) or (12 <= hour < 13) or (17 <= hour < 20)
    # Weekend: single daytime window.
    return 10 <= hour < 17
# Keeping only the rows that fall in the desired time slots.
# .copy() materialises an independent DataFrame so the column assignment
# below does not raise pandas' SettingWithCopyWarning (assigning into a
# boolean-indexed slice may write to a temporary view).
filtered_df = merged_df[merged_df.apply(check_time_slot, axis=1)].copy()
# Mapping from integers to weekday names
day_of_week_map = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}
# Creating a new column 'weekday_name' by applying the mapping to the 'day_of_week' column
filtered_df['weekday_name'] = filtered_df['day_of_week'].map(day_of_week_map)
# Printing the head of the modified dataset to get an overview
filtered_df.head()
# Printing the number of rows in filtered dataset
print("The total rows in filtered dataset is:",len(filtered_df))
# Printing the names of the columns in the filtered dataset
print("\nColumn names in the filtered dataset:", list(filtered_df.columns))
# Saving the filtered dataset for further use (as backup)
filtered_df.to_csv('filtered_data.csv', index=False)
C:\Users\Dell\AppData\Local\Temp\ipykernel_43472\4262330997.py:32: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_df['weekday_name'] = filtered_df['day_of_week'].map(day_of_week_map)
The total rows in filtered dataset is: 581693 Column names in the filtered dataset: ['STATION ID', 'TIME', 'LAST UPDATED', 'NAME', 'BIKE_STANDS', 'AVAILABLE_BIKE_STANDS', 'AVAILABLE_BIKES', 'STATUS', 'ADDRESS', 'LATITUDE', 'LONGITUDE', 'Month', 'datetime_column', 'day_of_week', 'hour_of_day', 'weekday_name']
# Load the prepared (filtered) dataset from disk for this analysis step.
dataset = pd.read_csv("filtered_data.csv")
# Inspect station statuses before cleaning.
unique_statuses = dataset['STATUS'].unique()
print("The unique status of the stations are:", unique_statuses)
# How many records belong to closed stations?
closed_count = (dataset['STATUS'] == 'CLOSED').sum()
print("\nNumber of occurrences of 'CLOSED' in STATUS column:", closed_count)
# Keep a reference to the closed-station rows, then drop them,
# along with the test station (STATION ID 507).
closed_stations = dataset[dataset['STATUS'] == 'CLOSED']
dataset_unique = dataset[dataset['STATUS'] != 'CLOSED']
dataset_unique = dataset_unique[dataset_unique['STATION ID'] != 507]
print("\n", dataset_unique.head())
The unique status of the stations are: ['OPEN' 'CLOSED']
Number of occurrences of 'CLOSED' in STATUS column: 70
STATION ID TIME LAST UPDATED NAME \
0 1 2023-01-01 10:00:03 2023-01-01 09:57:04 CLARENDON ROW
1 2 2023-01-01 10:00:03 2023-01-01 09:55:57 BLESSINGTON STREET
2 3 2023-01-01 10:00:03 2023-01-01 09:51:11 BOLTON STREET
3 4 2023-01-01 10:00:03 2023-01-01 09:51:33 GREEK STREET
4 5 2023-01-01 10:00:03 2023-01-01 09:58:41 CHARLEMONT PLACE
BIKE_STANDS AVAILABLE_BIKE_STANDS AVAILABLE_BIKES STATUS \
0 31 29 2 OPEN
1 20 18 2 OPEN
2 20 9 11 OPEN
3 20 6 14 OPEN
4 40 16 24 OPEN
ADDRESS LATITUDE LONGITUDE Month datetime_column \
0 Clarendon Row 53.3409 -6.26250 1 2023-01-01 10:00:03
1 Blessington Street 53.3568 -6.26814 1 2023-01-01 10:00:03
2 Bolton Street 53.3512 -6.26986 1 2023-01-01 10:00:03
3 Greek Street 53.3469 -6.27298 1 2023-01-01 10:00:03
4 Charlemont Street 53.3307 -6.26018 1 2023-01-01 10:00:03
day_of_week hour_of_day weekday_name
0 6 10 Sunday
1 6 10 Sunday
2 6 10 Sunday
3 6 10 Sunday
4 6 10 Sunday
# The CSV round-trip stored the timestamps as strings; restore datetime dtype.
dataset_unique['datetime_column'] = pd.to_datetime(dataset_unique['datetime_column'])
# Split the timestamp into calendar components for daily grouping.
dataset_unique['Year'] = dataset_unique['datetime_column'].dt.year
dataset_unique['Month'] = dataset_unique['datetime_column'].dt.month
dataset_unique['Day'] = dataset_unique['datetime_column'].dt.day
# Per-station daily means of occupied and total stands.
group_cols = ['STATION ID', 'Year', 'Month', 'Day']
daily_means = (
    dataset_unique
    .groupby(group_cols)
    .agg({'AVAILABLE_BIKE_STANDS': 'mean', 'BIKE_STANDS': 'mean'})
    .reset_index()
)
# Daily usage = percentage of a station's stands that are occupied.
daily_means['Usage Percentage'] = (
    (daily_means['BIKE_STANDS'] - daily_means['AVAILABLE_BIKE_STANDS'])
    / daily_means['BIKE_STANDS'] * 100
)
# Attach the daily usage back onto the row-level data.
dataset_unique = pd.merge(
    dataset_unique,
    daily_means[group_cols + ['Usage Percentage']],
    on=group_cols,
    how='left',
)
# Reduce to one row per station per day.
dataset_unique = dataset_unique.drop_duplicates(subset=['Month', 'Day', 'STATION ID'])
print(dataset_unique.head())
STATION ID TIME LAST UPDATED NAME \
0 1 2023-01-01 10:00:03 2023-01-01 09:57:04 CLARENDON ROW
1 2 2023-01-01 10:00:03 2023-01-01 09:55:57 BLESSINGTON STREET
2 3 2023-01-01 10:00:03 2023-01-01 09:51:11 BOLTON STREET
3 4 2023-01-01 10:00:03 2023-01-01 09:51:33 GREEK STREET
4 5 2023-01-01 10:00:03 2023-01-01 09:58:41 CHARLEMONT PLACE
BIKE_STANDS AVAILABLE_BIKE_STANDS AVAILABLE_BIKES STATUS \
0 31 29 2 OPEN
1 20 18 2 OPEN
2 20 9 11 OPEN
3 20 6 14 OPEN
4 40 16 24 OPEN
ADDRESS LATITUDE LONGITUDE Month datetime_column \
0 Clarendon Row 53.3409 -6.26250 1 2023-01-01 10:00:03
1 Blessington Street 53.3568 -6.26814 1 2023-01-01 10:00:03
2 Bolton Street 53.3512 -6.26986 1 2023-01-01 10:00:03
3 Greek Street 53.3469 -6.27298 1 2023-01-01 10:00:03
4 Charlemont Street 53.3307 -6.26018 1 2023-01-01 10:00:03
day_of_week hour_of_day weekday_name Year Day Usage Percentage
0 6 10 Sunday 2023 1 7.603687
1 6 10 Sunday 2023 1 27.142857
2 6 10 Sunday 2023 1 46.428571
3 6 10 Sunday 2023 1 63.214286
4 6 10 Sunday 2023 1 58.214286
# Target: the station's daily usage percentage.
y = dataset_unique['Usage Percentage']
# Features: station identity, available bikes and weekday.
X_new = dataset_unique[['STATION ID', 'AVAILABLE_BIKES', 'day_of_week']]
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, random_state=42)
# Candidate regressors, all with default hyper-parameters.
models = {
    "Ridge Base": Ridge(),
    "Lasso Base": Lasso(),
    "ElasticNet Base": ElasticNet(),
    'Linear Regression': LinearRegression(),  # Linear Regression
    'Random Forest Regressor': RandomForestRegressor(random_state=42),  # Random Forest Regressor
}
# Fit every candidate and report its train/test MSE.
for name, model in models.items():
    model.fit(X_train, y_train)
    mse_test = mean_squared_error(y_test, model.predict(X_test))
    mse_train = mean_squared_error(y_train, model.predict(X_train))
    print(f"{name} (MSE_Test) = {mse_test}")
    print(f"{name} (MSE_Train) = {mse_train}")
Ridge Base (MSE_Test) = 296.7166429908603 Ridge Base (MSE_Train) = 285.8666307351733 Lasso Base (MSE_Test) = 297.12323957199254 Lasso Base (MSE_Train) = 286.12717398594134 ElasticNet Base (MSE_Test) = 296.96748896892075 ElasticNet Base (MSE_Train) = 285.9860569449409 Linear Regression (MSE_Test) = 296.7166390299895 Linear Regression (MSE_Train) = 285.8666307351055 Random Forest Regressor (MSE_Test) = 174.63904270377736 Random Forest Regressor (MSE_Train) = 77.43153515587545
# Predictions from the Random Forest model on the held-out test set.
y_pred_rf = models['Random Forest Regressor'].predict(X_test)
# Scatter of predicted vs actual usage, with the identity line for reference.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_rf, color='blue', label='Predicted usage')
lo, hi = min(y_test), max(y_test)
plt.plot([lo, hi], [lo, hi], color='red', linestyle='--', label='Actual usage')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Random Forest Regressor Predictions vs. Actual Values of Usage')
plt.legend()
plt.grid(True)
plt.show()
# Score every station/day with the Random Forest model.
dataset_unique['Usage_predictions'] = models['Random Forest Regressor'].predict(X_new)
# Collapse to one row per station: the station name plus averages of the
# numeric columns. Each input row is one station's daily average usage.
data_pred = dataset_unique.groupby('STATION ID').agg({
    'NAME': 'first',
    'BIKE_STANDS': 'mean',
    'AVAILABLE_BIKE_STANDS': 'mean',
    'AVAILABLE_BIKES': 'mean',
    'LATITUDE': 'mean',
    'LONGITUDE': 'mean',
    'Usage_predictions': 'mean',
}).reset_index()
print('The dataset with predicted values for daily usage level for each station is:\n', data_pred)
# Highest- and lowest-usage stations by predicted value.
top_5_predicted = data_pred.nlargest(5, 'Usage_predictions')
bottom_5_predicted = data_pred.nsmallest(5, 'Usage_predictions')
print('\nPREDICTED: TOP 5 STATIONS\n', top_5_predicted[['STATION ID', 'NAME', 'Usage_predictions']])
print('\nPREDICTED: BOTTOM 5 STATIONS\n', bottom_5_predicted[['STATION ID', 'NAME', 'Usage_predictions']])
# Backup of the per-station predictions (used again in Task 2).
data_pred.to_csv('dataset_with_predictions.csv', index=False)
The dataset with predicted values for daily usage level for each station is:
STATION ID NAME BIKE_STANDS AVAILABLE_BIKE_STANDS \
0 1 CLARENDON ROW 31.0 16.961644
1 2 BLESSINGTON STREET 20.0 14.687671
2 3 BOLTON STREET 20.0 12.476712
3 4 GREEK STREET 20.0 11.194521
4 5 CHARLEMONT PLACE 40.0 12.410959
.. ... ... ... ...
109 113 MERRION SQUARE SOUTH 40.0 30.624658
110 114 WILTON TERRACE (PARK) 40.0 27.139726
111 115 KILLARNEY STREET 30.0 8.564384
112 116 BROADSTONE 30.0 27.895890
113 117 HANOVER QUAY EAST 40.0 31.742466
AVAILABLE_BIKES LATITUDE LONGITUDE Usage_predictions
0 14.024658 53.3409 -6.26250 50.898496
1 5.304110 53.3568 -6.26814 18.881614
2 7.506849 53.3512 -6.26986 37.645840
3 8.775342 53.3469 -6.27298 47.050910
4 27.553425 53.3307 -6.26018 42.660139
.. ... ... ... ...
109 7.463014 53.3386 -6.24861 27.959888
110 12.490411 53.3337 -6.24834 40.782061
111 21.402740 53.3548 -6.24758 46.850345
112 2.095890 53.3547 -6.27231 5.300500
113 8.241096 53.3437 -6.23175 30.305323
[114 rows x 8 columns]
PREDICTED: TOP 5 STATIONS
STATION ID NAME Usage_predictions
88 92 HEUSTON BRIDGE (NORTH) 68.527906
9 10 DAME STREET 66.885357
32 33 PRINCES STREET / O'CONNELL STREET 66.261155
22 23 CUSTOM HOUSE 62.728942
21 22 TOWNSEND STREET 61.522133
PREDICTED: BOTTOM 5 STATIONS
STATION ID NAME Usage_predictions
112 116 BROADSTONE 5.300500
107 111 MOUNTJOY SQUARE EAST 6.458657
85 89 FITZWILLIAM SQUARE EAST 13.520390
100 104 GRANGEGORMAN LOWER (CENTRAL) 15.565386
59 61 HARDWICKE PLACE 15.660697
# Map of predicted daily usage per station; marker size and colour both
# encode the predicted usage level.
fig = px.scatter_mapbox(
    data_pred,
    lat='LATITUDE',
    lon='LONGITUDE',
    color='Usage_predictions',
    size='Usage_predictions',
    size_max=20,
    color_continuous_scale='Jet',
    hover_name='NAME',
    hover_data=['Usage_predictions', 'STATION ID'],
    mapbox_style="carto-positron",
    zoom=12,
)
# Add the figure title and render.
fig.update_layout(title='Daily Average Usage Percentage Heatmap for Dublin Bikes')
fig.show()
# Reload the per-station prediction dataset produced earlier.
data_pred = pd.read_csv("dataset_with_predictions.csv")
# Visual sanity check of what was loaded.
print(data_pred)
STATION ID NAME BIKE_STANDS AVAILABLE_BIKE_STANDS \
0 1 CLARENDON ROW 31.0 16.961644
1 2 BLESSINGTON STREET 20.0 14.687671
2 3 BOLTON STREET 20.0 12.476712
3 4 GREEK STREET 20.0 11.194521
4 5 CHARLEMONT PLACE 40.0 12.410959
.. ... ... ... ...
109 113 MERRION SQUARE SOUTH 40.0 30.624658
110 114 WILTON TERRACE (PARK) 40.0 27.139726
111 115 KILLARNEY STREET 30.0 8.564384
112 116 BROADSTONE 30.0 27.895890
113 117 HANOVER QUAY EAST 40.0 31.742466
AVAILABLE_BIKES LATITUDE LONGITUDE Usage_predictions
0 14.024658 53.3409 -6.26250 50.898496
1 5.304110 53.3568 -6.26814 18.881614
2 7.506849 53.3512 -6.26986 37.645840
3 8.775342 53.3469 -6.27298 47.050910
4 27.553425 53.3307 -6.26018 42.660139
.. ... ... ... ...
109 7.463014 53.3386 -6.24861 27.959888
110 12.490411 53.3337 -6.24834 40.782061
111 21.402740 53.3548 -6.24758 46.850345
112 2.095890 53.3547 -6.27231 5.300500
113 8.241096 53.3437 -6.23175 30.305323
[114 rows x 8 columns]
# Cluster stations on predicted usage plus location.
# NOTE(review): the features are unscaled, so Usage_predictions (0-100)
# dominates the latitude/longitude axes — confirm this weighting is intended.
X = data_pred[['Usage_predictions', 'LATITUDE', 'LONGITUDE']]
n_clusters = 6
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_
# Store labels as strings so Plotly treats them as discrete categories.
data_pred['cluster_kmeans'] = labels
data_pred['cluster_kmeans'] = data_pred['cluster_kmeans'].astype(str)
# Fixed colour per K-means cluster label.
cluster_colors1 = {
    '0': 'green',
    '1': 'red',
    '2': 'orange',
    '3': 'blue',
    '4': 'brown',
    '5': 'black',
}
# Map of the K-means clusters, sized by predicted usage.
fig = px.scatter_mapbox(
    data_pred,
    lat="LATITUDE",
    lon="LONGITUDE",
    color="cluster_kmeans",
    color_discrete_map=cluster_colors1,
    size="Usage_predictions",
    size_max=15,
    zoom=12,
    mapbox_style="carto-positron",
    title="KMeans Dublin Clusters (6)",
)
fig.show()
C:\Users\Dell\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1446: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
# Hand-crafted clusters of high/low usage areas: each station-ID group below
# maps, in order, to one cluster number from cluster_values.
cluster_members = [
    [93, 100, 92, 86, 94],
    [9, 14, 10, 1, 27, 29, 6, 17],
    [16, 23, 22, 33, 32, 21, 8, 38, 64, 99],
    [103, 104, 105, 102, 116, 110, 2, 30, 12],
    [111, 112, 28, 44, 60, 59, 61, 79, 78, 15],
    [89, 13, 20, 39, 114, 19, 47],
]
conditions = [data_pred['STATION ID'].isin(ids) for ids in cluster_members]
cluster_values = [1, 2, 3, 4, 5, 6]
# Fixed colour per manual cluster.
cluster_colors2 = {
    1: 'red',
    2: 'orange',
    3: 'brown',
    4: 'blue',
    5: 'green',
    6: 'black',
}
# Stations not listed in any group get Cluster = None.
data_pred['Cluster'] = np.select(conditions, cluster_values, default=None)
print(data_pred.head())
# Map of the manual clusters, markers sized by predicted usage.
fig = px.scatter_mapbox(
    data_pred,
    lat='LATITUDE',
    lon='LONGITUDE',
    hover_name='NAME',
    hover_data=['Usage_predictions', 'STATION ID', 'Cluster'],
    color='Cluster',
    color_discrete_map=cluster_colors2,
    size='Usage_predictions',
    size_max=20,
    mapbox_style="carto-positron",
    zoom=12,
)
# Title, legend styling and map centring on the mean station position.
fig.update_layout(
    title='Clusters for weight',
    legend_title='Cluster',
    legend=dict(
        title='Cluster',
        traceorder='normal',  # keep legend entries in numerical order
    ),
    mapbox=dict(
        center=dict(
            lat=data_pred['LATITUDE'].mean(),
            lon=data_pred['LONGITUDE'].mean(),
        ),
        zoom=12,
    ),
)
fig.show()
STATION ID NAME BIKE_STANDS AVAILABLE_BIKE_STANDS \ 0 1 CLARENDON ROW 31.0 16.961644 1 2 BLESSINGTON STREET 20.0 14.687671 2 3 BOLTON STREET 20.0 12.476712 3 4 GREEK STREET 20.0 11.194521 4 5 CHARLEMONT PLACE 40.0 12.410959 AVAILABLE_BIKES LATITUDE LONGITUDE Usage_predictions cluster_kmeans \ 0 14.024658 53.3409 -6.26250 50.898496 1 1 5.304110 53.3568 -6.26814 18.881614 4 2 7.506849 53.3512 -6.26986 37.645840 0 3 8.775342 53.3469 -6.27298 47.050910 3 4 27.553425 53.3307 -6.26018 42.660139 3 Cluster 0 2 1 4 2 None 3 None 4 None
# Removing rows that were not assigned to any manual cluster.
# .copy() materialises an independent frame so the weight assignment below
# does not trigger pandas' SettingWithCopyWarning (dropna on a slice can
# leave the result flagged as a view of data_pred).
data = data_pred.dropna(subset=['Cluster']).copy()
# Calculating total bike stands in each cluster
total_bike_stands_cluster = data.groupby('Cluster')['BIKE_STANDS'].sum()
print('The total bike stands in each cluster is:\n', total_bike_stands_cluster)
# Calculating total available bikes in each cluster
total_available_bikes_cluster = data.groupby('Cluster')['AVAILABLE_BIKES'].sum()
print('\nThe total available bikes in each cluster is:\n', total_available_bikes_cluster)
# Initial weight: the station's available bikes scaled by its cluster's
# total stands over its cluster's total available bikes. The per-row lookup
# of the cluster totals replaces the original per-cluster loop and produces
# the same values in one vectorized assignment.
cluster_ids = data['Cluster']
data['Weight_initial'] = (
    data['AVAILABLE_BIKES'].to_numpy()
    * total_bike_stands_cluster.loc[cluster_ids].to_numpy()
    / total_available_bikes_cluster.loc[cluster_ids].to_numpy()
)
# Printing the head of the DataFrame with 'Weight_initial' column
print('\nThe dataset with initial weight values is:\n', data.head())
The total bike stands in each cluster is:
Cluster
1 183.0
2 190.0
3 293.0
4 286.0
5 288.0
6 230.0
Name: BIKE_STANDS, dtype: float64
The total available bikes in each cluster is:
Cluster
1 129.810959
2 77.578082
3 127.309589
4 56.827397
5 79.800000
6 47.367123
Name: AVAILABLE_BIKES, dtype: float64
The dataset with initial weight values is:
STATION ID NAME BIKE_STANDS AVAILABLE_BIKE_STANDS \
0 1 CLARENDON ROW 31.0 16.961644
1 2 BLESSINGTON STREET 20.0 14.687671
5 6 CHRISTCHURCH PLACE 20.0 13.504110
7 8 CUSTOM HOUSE QUAY 30.0 15.736986
8 9 EXCHEQUER STREET 24.0 13.287671
AVAILABLE_BIKES LATITUDE LONGITUDE Usage_predictions cluster_kmeans \
0 14.024658 53.3409 -6.26250 50.898496 1
1 5.304110 53.3568 -6.26814 18.881614 4
5 6.432877 53.3434 -6.27012 29.910910 2
7 9.128767 53.3479 -6.24805 50.995859 1
8 10.652055 53.3430 -6.26358 58.496224 1
Cluster Weight_initial
0 2 34.348425
1 4 26.694436
5 2 15.755050
7 3 21.009641
8 2 26.088431
C:\Users\Dell\AppData\Local\Temp\ipykernel_43472\1645359863.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Per-cluster average number of bike stands, used to size the new stations.
average_bike_stands_cluster = data.groupby('Cluster')['BIKE_STANDS'].mean()
# New stations are added to clusters 1-3 at hand-picked coordinates.
# Note that Google Streetview was used to identify exact location points
# on the map that optimise user experience.
clusters_to_iterate = [1, 2, 3]
new_row_coords = [
    (53.346459599999996, -6.292984499999996),
    (53.34420817918841, -6.260144986133722),
    (53.34742644494483, -6.255404095167318)
]
# Collect the new rows as dicts and build the DataFrame once: concatenating
# inside the loop against an initially empty all-object frame is quadratic
# and deprecated in recent pandas versions.
new_row_dicts = []
for idx, cluster_id in enumerate(clusters_to_iterate):
    new_row_dicts.append({
        'STATION ID': f'new_{idx+1}',
        'NAME': f'new_{idx+1}',
        'BIKE_STANDS': average_bike_stands_cluster[cluster_id],
        'AVAILABLE_BIKE_STANDS': 0,
        # New stations start full: every stand holds a bike.
        'AVAILABLE_BIKES': average_bike_stands_cluster[cluster_id],
        'LATITUDE': new_row_coords[idx][0],
        'LONGITUDE': new_row_coords[idx][1],
        'Usage_predictions': 0,
        'Cluster': cluster_id,
        'Weight_initial': 0
    })
# Reindex to data's columns so unspecified ones (e.g. cluster_kmeans) are NaN,
# matching the original behaviour, then append in a single concat.
new_rows = pd.DataFrame(new_row_dicts, columns=data.columns)
data = pd.concat([data, new_rows], ignore_index=True)
# List of station IDs to remove (lowest-usage stations)
stations_to_remove = [116, 111, 89, 61, 104]
# Filtering out rows with the specified station IDs
data = data[~data['STATION ID'].isin(stations_to_remove)]
# Printing the updated DataFrame with new stations added and low usage stations removed
print('The updated dataset with new stations added and low usage stations removed is:\n', data)
The updated dataset with new stations added and low usage stations removed is:
STATION ID NAME BIKE_STANDS \
0 1 CLARENDON ROW 31.00
1 2 BLESSINGTON STREET 20.00
2 6 CHRISTCHURCH PLACE 20.00
3 8 CUSTOM HOUSE QUAY 30.00
4 9 EXCHEQUER STREET 24.00
5 10 DAME STREET 16.00
6 12 ECCLES STREET 20.00
7 13 FITZWILLIAM SQUARE WEST 30.00
8 14 FOWNES STREET UPPER 30.00
9 15 HARDWICKE STREET 16.00
10 16 GEORGES QUAY 20.00
11 17 GOLDEN LANE 20.00
12 19 HERBERT PLACE 30.00
13 20 JAMES STREET EAST 30.00
14 21 LEINSTER STREET SOUTH 30.00
15 22 TOWNSEND STREET 20.00
16 23 CUSTOM HOUSE 30.00
17 27 MOLESWORTH STREET 20.00
18 28 MOUNTJOY SQUARE WEST 30.00
19 29 ORMOND QUAY UPPER 29.00
20 30 PARNELL SQUARE NORTH 20.00
21 32 PEARSE STREET 30.00
22 33 PRINCES STREET / O'CONNELL STREET 23.00
23 38 TALBOT STREET 40.00
24 39 WILTON TERRACE 20.00
25 44 UPPER SHERRARD STREET 30.00
26 47 HERBERT STREET 40.00
27 59 DENMARK STREET GREAT 20.00
28 60 NORTH CIRCULAR ROAD 30.00
30 64 SANDWITH STREET 40.00
31 78 MATER HOSPITAL 40.00
32 79 ECCLES STREET EAST 27.00
33 86 PARKGATE STREET 38.00
35 92 HEUSTON BRIDGE (NORTH) 40.00
36 93 HEUSTON STATION (CENTRAL) 40.00
37 94 HEUSTON STATION (CAR PARK) 40.00
38 99 CITY QUAY 30.00
39 100 HEUSTON BRIDGE (SOUTH) 25.00
40 102 WESTERN WAY 40.00
41 103 GRANGEGORMAN LOWER (SOUTH) 40.00
43 105 GRANGEGORMAN LOWER (NORTH) 36.00
44 110 PHIBSBOROUGH ROAD 40.00
46 112 NORTH CIRCULAR ROAD (O'CONNELL'S) 30.00
47 114 WILTON TERRACE (PARK) 40.00
49 new_1 new_1 36.60
50 new_2 new_2 23.75
51 new_3 new_3 29.30
AVAILABLE_BIKE_STANDS AVAILABLE_BIKES LATITUDE LONGITUDE \
0 16.961644 14.024658 53.340900 -6.262500
1 14.687671 5.304110 53.356800 -6.268140
2 13.50411 6.432877 53.343400 -6.270120
3 15.736986 9.128767 53.347900 -6.248050
4 13.287671 10.652055 53.343000 -6.263580
5 6.967123 9.005479 53.344000 -6.266800
6 14.517808 5.482192 53.359200 -6.269780
7 24.90137 4.830137 53.336100 -6.252820
8 19.487671 10.476712 53.344600 -6.263370
9 13.054795 2.161644 53.355500 -6.264420
10 10.342466 9.646575 53.347500 -6.252190
11 14.454795 5.528767 53.340800 -6.267730
12 21.334247 8.383562 53.334400 -6.245570
13 24.931507 5.063014 53.336600 -6.248110
14 17.424658 12.539726 53.342200 -6.254490
15 8.967123 10.701370 53.345900 -6.254610
16 13.920548 15.695890 53.348300 -6.254660
17 12.531507 7.449315 53.341300 -6.258120
18 17.087671 12.865753 53.356300 -6.258590
19 14.936986 14.008219 53.346100 -6.268000
20 16.131507 3.789041 53.353500 -6.265310
21 17.252055 12.726027 53.344300 -6.250430
22 10.29589 12.657534 53.349000 -6.260310
23 15.484932 24.400000 53.351000 -6.252940
24 11.156164 8.643836 53.332400 -6.252720
25 20.673973 9.315068 53.358400 -6.260640
26 34.216438 5.389041 53.335700 -6.245510
27 15.30137 4.690411 53.355600 -6.261400
28 16.087671 13.849315 53.359600 -6.260350
30 24.682192 15.284932 53.345200 -6.247160
31 26.926027 13.035616 53.360000 -6.264830
32 22.813699 4.164384 53.358100 -6.265600
33 10.345205 27.630137 53.348000 -6.291800
35 5.506849 34.487671 53.347800 -6.292430
36 9.876712 30.106849 53.346600 -6.296920
37 20.030137 19.969863 53.347000 -6.297800
38 25.449315 4.528767 53.346600 -6.246150
39 7.375342 17.616438 53.347100 -6.292040
40 30.019178 9.641096 53.354900 -6.269420
41 34.964384 5.024658 53.354700 -6.278680
43 28.194521 7.802740 53.356000 -6.278380
44 28.452055 11.526027 53.356300 -6.273720
46 19.668493 10.317808 53.357800 -6.251560
47 27.139726 12.490411 53.333700 -6.248340
49 0 36.600000 53.346460 -6.292984
50 0 23.750000 53.344208 -6.260145
51 0 29.300000 53.347426 -6.255404
Usage_predictions cluster_kmeans Cluster Weight_initial
0 50.898496 1 2 34.348425
1 18.881614 4 4 26.694436
2 29.91091 2 2 15.75505
3 50.995859 1 3 21.009641
4 58.496224 1 2 26.088431
5 66.885357 5 2 22.055728
6 20.505096 4 4 27.590686
7 23.652421 2 6 23.453641
8 53.449347 1 2 25.658991
9 18.046534 4 5 7.801421
10 55.528331 1 3 22.201364
11 27.798568 2 2 13.540754
12 44.19326 3 6 40.707965
13 32.093773 0 6 24.584418
14 57.525088 1 3 28.859882
15 61.522133 5 3 24.628949
16 62.728942 5 3 36.12372
17 58.0668 1 2 18.244455
18 31.692939 0 5 46.432794
19 54.602995 1 2 34.308165
20 18.577317 4 4 19.069424
21 53.227569 1 3 29.28865
22 66.261155 5 3 29.131015
23 57.779409 1 3 56.156021
24 29.47649 2 6 41.971774
25 19.381672 4 5 33.618292
26 26.421215 2 6 26.167505
27 20.258112 4 5 16.927799
28 31.614857 0 5 49.98249
30 42.025642 3 3 35.177907
31 26.857023 2 5 47.045834
32 17.44585 4 5 15.029354
33 53.276721 1 1 38.951373
35 68.527906 5 1 48.618729
36 58.993286 1 1 42.442899
37 40.448883 3 1 28.152361
38 25.388022 2 3 10.42285
39 61.496249 5 1 24.834638
40 23.102577 2 4 48.52155
41 15.908157 4 4 25.288015
43 19.402861 4 4 39.269501
44 27.726214 2 4 58.008003
46 28.730846 2 5 37.237203
47 40.782061 3 6 60.649546
49 0 NaN 1 0
50 0 NaN 2 0
51 0 NaN 3 0
## Recomputing station weights after adding and removing stations
# Updated totals per cluster.
total_bike_stands_cluster_new = data.groupby('Cluster')['BIKE_STANDS'].sum()
print('The new total bike stands in each cluster is:\n', total_bike_stands_cluster_new)
total_available_bikes_cluster_new = data.groupby('Cluster')['AVAILABLE_BIKES'].sum()
print('\nThe new total available bikes in each cluster is:\n', total_available_bikes_cluster_new)
# Recompute each station's weight within its cluster using the new totals.
for cid in data['Cluster'].unique():
    in_cluster = data['Cluster'] == cid
    data.loc[in_cluster, 'Weight_new'] = (
        data.loc[in_cluster, 'AVAILABLE_BIKES']
        * total_bike_stands_cluster_new.loc[cid]
        / total_available_bikes_cluster_new.loc[cid]
    )
print('\nThe dataset with new weight values is:\n', data.head())
The new total bike stands in each cluster is: Cluster 1 219.60 2 213.75 3 322.30 4 216.00 5 223.00 6 190.00 Name: BIKE_STANDS, dtype: float64 The new total available bikes in each cluster is: Cluster 1 166.410959 2 101.328082 3 156.609589 4 48.569863 5 70.400000 6 44.800000 Name: AVAILABLE_BIKES, dtype: float64 The dataset with new weight values is: STATION ID NAME BIKE_STANDS AVAILABLE_BIKE_STANDS \ 0 1 CLARENDON ROW 31.0 16.961644 1 2 BLESSINGTON STREET 20.0 14.687671 2 6 CHRISTCHURCH PLACE 20.0 13.50411 3 8 CUSTOM HOUSE QUAY 30.0 15.736986 4 9 EXCHEQUER STREET 24.0 13.287671 AVAILABLE_BIKES LATITUDE LONGITUDE Usage_predictions cluster_kmeans \ 0 14.024658 53.3409 -6.26250 50.898496 1 1 5.304110 53.3568 -6.26814 18.881614 4 2 6.432877 53.3434 -6.27012 29.91091 2 3 9.128767 53.3479 -6.24805 50.995859 1 4 10.652055 53.3430 -6.26358 58.496224 1 Cluster Weight_initial Weight_new 0 2 34.348425 29.584795 1 4 26.694436 23.588448 2 2 15.75505 13.570053 3 3 21.009641 18.786855 4 2 26.088431 22.470343
# Target and features, excluding the three synthetic stations appended last
# (they have no real usage history).
y = data['Usage_predictions'][:-3]
X_new = data[['Cluster', 'Weight_initial', 'AVAILABLE_BIKES']][:-3]
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, random_state=42)
# Same candidate regressors as before, refit on the weight-based features.
models = {
    "Ridge Base": Ridge(),
    "Lasso Base": Lasso(),
    "ElasticNet Base": ElasticNet(),
    'Linear Regression': LinearRegression(),  # Linear Regression
    'Random Forest Regressor': RandomForestRegressor(random_state=42),  # Random Forest Regressor
}
# Fit every candidate and report its train/test MSE.
for name, model in models.items():
    model.fit(X_train, y_train)
    mse_test = mean_squared_error(y_test, model.predict(X_test))
    mse_train = mean_squared_error(y_train, model.predict(X_train))
    print(f"{name} (MSE_Test) = {mse_test}")
    print(f"{name} (MSE_Train) = {mse_train}")
Ridge Base (MSE_Test) = 132.31537841706378 Ridge Base (MSE_Train) = 140.67315191054044 Lasso Base (MSE_Test) = 131.73887782082974 Lasso Base (MSE_Train) = 141.2981387637917 ElasticNet Base (MSE_Test) = 134.1657013607262 ElasticNet Base (MSE_Train) = 143.27351553912357 Linear Regression (MSE_Test) = 132.2945433689032 Linear Regression (MSE_Train) = 140.65894726553083 Random Forest Regressor (MSE_Test) = 71.91473220266676 Random Forest Regressor (MSE_Train) = 10.302148113753274
## Predicting usage for all stations under the new weights
X_pred = data[['Cluster', 'Weight_new', 'AVAILABLE_BIKES']]
# The model was trained on a column named 'Weight_initial', so the new
# weights must be presented under that name.
X_pred = X_pred.rename(columns={'Weight_new': 'Weight_initial'})
# Score every station with the fitted Random Forest model.
data['Usage_predictions_new'] = models['Random Forest Regressor'].predict(X_pred)
print('\nThe dataset with new usage prediction values is:\n', data.head())
The dataset with new usage prediction values is: STATION ID NAME BIKE_STANDS AVAILABLE_BIKE_STANDS \ 0 1 CLARENDON ROW 31.0 16.961644 1 2 BLESSINGTON STREET 20.0 14.687671 2 6 CHRISTCHURCH PLACE 20.0 13.50411 3 8 CUSTOM HOUSE QUAY 30.0 15.736986 4 9 EXCHEQUER STREET 24.0 13.287671 AVAILABLE_BIKES LATITUDE LONGITUDE Usage_predictions cluster_kmeans \ 0 14.024658 53.3409 -6.26250 50.898496 1 1 5.304110 53.3568 -6.26814 18.881614 4 2 6.432877 53.3434 -6.27012 29.91091 2 3 9.128767 53.3479 -6.24805 50.995859 1 4 10.652055 53.3430 -6.26358 58.496224 1 Cluster Weight_initial Weight_new Usage_predictions_new 0 2 34.348425 29.584795 52.804455 1 4 26.694436 23.588448 20.235560 2 2 15.75505 13.570053 33.546574 3 3 21.009641 18.786855 53.367074 4 2 26.088431 22.470343 58.369274
# Keeping only the rows whose station ID is NOT in the removal list
keep_mask = ~data_pred['STATION ID'].isin(stations_to_remove)
data_use_new = data_pred[keep_mask]
# Showing the filtered DataFrame
print('The updated dataset with specified stations removed and new stations added is:\n', data_use_new)
The updated dataset with specified stations removed and new stations added is:
STATION ID NAME BIKE_STANDS \
0 1 CLARENDON ROW 31.0
1 2 BLESSINGTON STREET 20.0
2 3 BOLTON STREET 20.0
3 4 GREEK STREET 20.0
4 5 CHARLEMONT PLACE 40.0
.. ... ... ...
108 112 NORTH CIRCULAR ROAD (O'CONNELL'S) 30.0
109 113 MERRION SQUARE SOUTH 40.0
110 114 WILTON TERRACE (PARK) 40.0
111 115 KILLARNEY STREET 30.0
113 117 HANOVER QUAY EAST 40.0
AVAILABLE_BIKE_STANDS AVAILABLE_BIKES LATITUDE LONGITUDE \
0 16.961644 14.024658 53.3409 -6.26250
1 14.687671 5.304110 53.3568 -6.26814
2 12.476712 7.506849 53.3512 -6.26986
3 11.194521 8.775342 53.3469 -6.27298
4 12.410959 27.553425 53.3307 -6.26018
.. ... ... ... ...
108 19.668493 10.317808 53.3578 -6.25156
109 30.624658 7.463014 53.3386 -6.24861
110 27.139726 12.490411 53.3337 -6.24834
111 8.564384 21.402740 53.3548 -6.24758
113 31.742466 8.241096 53.3437 -6.23175
Usage_predictions cluster_kmeans Cluster
0 50.898496 1 2
1 18.881614 4 4
2 37.645840 0 None
3 47.050910 3 None
4 42.660139 3 None
.. ... ... ...
108 28.730846 2 5
109 27.959888 2 None
110 40.782061 3 6
111 46.850345 3 None
113 30.305323 2 None
[109 rows x 10 columns]
# Heatmap of the daily average usage level of bikes per station (before changes)
heatmap_kwargs = dict(
    lat='LATITUDE',
    lon='LONGITUDE',
    hover_name='NAME',
    hover_data=['Usage_predictions', 'STATION ID'],
    color='Usage_predictions',
    size='Usage_predictions',
    size_max=20,
    color_continuous_scale='Jet',
    mapbox_style="carto-positron",
    zoom=12,
)
fig = px.scatter_mapbox(data_use_new, **heatmap_kwargs)
# Setting the chart title
fig.update_layout(title='Daily Average Usage Percentage Heatmap for Dublin Bikes (before)')
# Rendering the figure inline
fig.show()
# Bringing the model's new usage predictions into the working station table,
# left-joined on 'STATION ID' so every station in data_use_new is kept
data_use_new = data_use_new.merge(
    data[['STATION ID', 'Usage_predictions_new']], on='STATION ID', how='left'
)
# Stations missing from 'data' receive NaN from the left join; fall back to
# their original predictions. Assigning the result — rather than calling
# fillna(inplace=True) on a chained selection — avoids the pandas chained-
# assignment FutureWarning and works correctly under copy-on-write.
data_use_new['Usage_predictions_new'] = data_use_new['Usage_predictions_new'].fillna(
    data_use_new['Usage_predictions']
)
# Station IDs of the proposed new stations to add
stations_to_add = ['new_1', 'new_2', 'new_3']
# Pulling the rows for those stations out of the full dataset
rows_to_add = data[data['STATION ID'].isin(stations_to_add)]
# Appending them to the working table, renumbering the index
data_use_new = pd.concat([data_use_new, rows_to_add], ignore_index=True)
# Printing the updated 'data_use_new' DataFrame
print(data_use_new)
STATION ID NAME BIKE_STANDS AVAILABLE_BIKE_STANDS \
0 1 CLARENDON ROW 31.00 16.961644
1 2 BLESSINGTON STREET 20.00 14.687671
2 3 BOLTON STREET 20.00 12.476712
3 4 GREEK STREET 20.00 11.194521
4 5 CHARLEMONT PLACE 40.00 12.410959
.. ... ... ... ...
107 115 KILLARNEY STREET 30.00 8.564384
108 117 HANOVER QUAY EAST 40.00 31.742466
109 new_1 new_1 36.60 0
110 new_2 new_2 23.75 0
111 new_3 new_3 29.30 0
AVAILABLE_BIKES LATITUDE LONGITUDE Usage_predictions cluster_kmeans \
0 14.024658 53.340900 -6.262500 50.898496 1
1 5.304110 53.356800 -6.268140 18.881614 4
2 7.506849 53.351200 -6.269860 37.64584 0
3 8.775342 53.346900 -6.272980 47.05091 3
4 27.553425 53.330700 -6.260180 42.660139 3
.. ... ... ... ... ...
107 21.402740 53.354800 -6.247580 46.850345 3
108 8.241096 53.343700 -6.231750 30.305323 2
109 36.600000 53.346460 -6.292984 0 NaN
110 23.750000 53.344208 -6.260145 0 NaN
111 29.300000 53.347426 -6.255404 0 NaN
Cluster Usage_predictions_new Weight_initial Weight_new
0 2 52.804455 NaN NaN
1 4 20.235560 NaN NaN
2 None 37.645840 NaN NaN
3 None 47.050910 NaN NaN
4 None 42.660139 NaN NaN
.. ... ... ... ...
107 None 46.850345 NaN NaN
108 None 30.305323 NaN NaN
109 1 63.407113 0 48.298261
110 2 57.571571 0 50.100252
111 3 58.108833 0 60.298926
[112 rows x 13 columns]
# Isolating the rows for the newly proposed stations
new_station_ids = ['new_1', 'new_2', 'new_3']
data_new_stations = data_use_new[data_use_new['STATION ID'].isin(new_station_ids)]
# Heatmap of the daily average usage level of bikes per station (after changes)
fig = px.scatter_mapbox(
    data_use_new,
    lat='LATITUDE',
    lon='LONGITUDE',
    hover_name='NAME',
    hover_data=['Usage_predictions_new', 'STATION ID'],
    color='Usage_predictions_new',
    size='Usage_predictions_new',
    size_max=20,
    color_continuous_scale='Jet',
    mapbox_style="carto-positron",
    zoom=12,
    title='Daily Average Usage Percentage Heatmap for Dublin Bikes (after)',
)
# Overlaying white circle markers so the new stations are easy to spot
fig.add_trace(go.Scattermapbox(
    lat=data_new_stations['LATITUDE'],
    lon=data_new_stations['LONGITUDE'],
    mode='markers',
    marker=dict(symbol='circle', size=15, color='white'),
    hoverinfo='text',
    text=data_new_stations['NAME'],
))
# Rendering the figure inline
fig.show()
# Saving the final dataset with new usage predictions to the working
# directory; index=False keeps the row index out of the CSV
data_use_new.to_csv('dataset_with_new_predictions.csv', index=False)
# Selecting specific columns to display, grouped by cluster.
# group_keys=False pins the current behavior (group keys are not prepended
# to the apply result's index), which silences the pandas FutureWarning
# this call previously emitted. Rows with a null 'Cluster' are dropped by
# groupby, as before.
grouped = (
    data_use_new
    .groupby('Cluster', group_keys=False)
    .apply(lambda x: x[['Cluster', 'STATION ID', 'Usage_predictions', 'Usage_predictions_new']])
    .sort_values(by='Cluster')
    .reset_index(drop=True)
)
# Printing the grouped data
print('Grouped data by Clusters with specific columns:\n', grouped)
Grouped data by Clusters with specific columns:
Cluster STATION ID Usage_predictions Usage_predictions_new
0 1 93 58.993286 55.402688
1 1 new_1 0 63.407113
2 1 100 61.496249 46.221735
3 1 86 53.276721 54.832878
4 1 92 68.527906 62.979385
5 1 94 40.448883 46.153871
6 2 new_2 0 57.571571
7 2 17 27.798568 33.103687
8 2 27 58.0668 43.154404
9 2 1 50.898496 52.804455
10 2 10 66.885357 57.484033
11 2 9 58.496224 58.369274
12 2 6 29.91091 33.546574
13 2 14 53.449347 59.072910
14 2 29 54.602995 53.556604
15 3 99 25.388022 32.183186
16 3 64 42.025642 50.658130
17 3 33 66.261155 61.805066
18 3 32 53.227569 57.805982
19 3 38 57.779409 58.553458
20 3 16 55.528331 53.474072
21 3 8 50.995859 53.367074
22 3 21 57.525088 60.319985
23 3 new_3 0 58.108833
24 3 23 62.728942 51.847799
25 3 22 61.522133 58.787304
26 4 30 18.577317 18.489099
27 4 103 15.908157 17.722522
28 4 102 23.102577 26.570461
29 4 12 20.505096 21.478624
30 4 105 19.402861 25.111618
31 4 110 27.726214 28.719501
32 4 2 18.881614 20.235560
33 5 78 26.857023 30.638531
34 5 28 31.692939 31.078592
35 5 15 18.046534 18.424242
36 5 60 31.614857 31.076083
37 5 59 20.258112 19.474751
38 5 44 19.381672 26.580429
39 5 79 17.44585 18.341248
40 5 112 28.730846 28.351047
41 6 114 40.782061 31.839393
42 6 39 29.47649 29.180011
43 6 20 32.093773 27.292227
44 6 47 26.421215 27.522670
45 6 19 44.19326 28.797845
46 6 13 23.652421 23.513533
C:\Users\Dell\AppData\Local\Temp\ipykernel_43472\1055471384.py:5: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)